Loading Libraries and Data Set

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
library(dplyr)
# Load the NFL data set (416 rows x 15 columns per the read_csv report below).
# NOTE(review): this is an absolute, machine-specific path; the script will not
# run on another computer without editing it -- consider a project-relative path.
main_df<-read_csv("/Users/abbysommers/Desktop/Summer Research Project/NFL_Data_2011-2023.csv")
## Rows: 416 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): LOCATION, TEAM, CONFERENCE, REGION
## dbl (6): YEAR, WINS, LOSSES, W/L_PERCENT, HIGH_PERCENT, LOW_PERCENT
## num (5): TOTAL_ALLOT, HIGHEST_SALARY, LOWEST_SALARY, HI_LOW_DIFF, AVE_SALARY
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Give the win/loss percentage column a syntactic name and store YEAR as text
# so that it is handled as a categorical variable in later plots.
main_df <- main_df %>%
  rename(W.L_PERCENT = `W/L_PERCENT`) %>%
  mutate(YEAR = as.character(YEAR))

Making Separate Data Sets Based on Conf. and Region

# One data frame per division: every combination of conference (NFC/AFC) and
# region (West/South/North/East).
nfc_west  <- filter(main_df, CONFERENCE == "NFC", REGION == "West")
afc_west  <- filter(main_df, CONFERENCE == "AFC", REGION == "West")
nfc_south <- filter(main_df, CONFERENCE == "NFC", REGION == "South")
afc_south <- filter(main_df, CONFERENCE == "AFC", REGION == "South")
nfc_north <- filter(main_df, CONFERENCE == "NFC", REGION == "North")
afc_north <- filter(main_df, CONFERENCE == "AFC", REGION == "North")
nfc_east  <- filter(main_df, CONFERENCE == "NFC", REGION == "East")
afc_east  <- filter(main_df, CONFERENCE == "AFC", REGION == "East")
head(nfc_west)
## # A tibble: 6 × 15
##   LOCATION TEAM     YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>    <chr>    <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Arizona  Cardina… 2011  NFC        West       8      8       0.5     118470364
## 2 Arizona  Cardina… 2012  NFC        West       5     11       0.312   117797796
## 3 Arizona  Cardina… 2013  NFC        West      13      3       0.812   120046145
## 4 Arizona  Cardina… 2014  NFC        West      11      5       0.688   133228945
## 5 Arizona  Cardina… 2015  NFC        West      13      3       0.812   144707976
## 6 Arizona  Cardina… 2016  NFC        West       7      9       0.438   155855865
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_west)
## # A tibble: 6 × 15
##   LOCATION    TEAM  YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>       <chr> <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Los Angeles Char… 2011  AFC        West       8      8       0.5     121746575
## 2 Los Angeles Char… 2012  AFC        West       7      9       0.438   125967448
## 3 Los Angeles Char… 2013  AFC        West       9      7       0.562   120452285
## 4 Los Angeles Char… 2014  AFC        West       9      7       0.562   134236725
## 5 Los Angeles Char… 2015  AFC        West       4     12       0.25    140814655
## 6 Los Angeles Char… 2016  AFC        West       5     11       0.312   155470367
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(nfc_east)
## # A tibble: 6 × 15
##   LOCATION TEAM   YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>    <chr>  <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 New York Giants 2011  NFC        East       9      7       0.562   119706771
## 2 New York Giants 2012  NFC        East       9      7       0.562   125514604
## 3 New York Giants 2013  NFC        East       7      9       0.438   129256255
## 4 New York Giants 2014  NFC        East       6     10       0.375   131935198
## 5 New York Giants 2015  NFC        East       6     10       0.375   128321422
## 6 New York Giants 2016  NFC        East      11      5       0.688   154901229
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_east)
## # A tibble: 6 × 15
##   LOCATION TEAM     YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>    <chr>    <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Miami    Dolphins 2011  AFC        East       6     10       0.375   132817373
## 2 Miami    Dolphins 2012  AFC        East       7      9       0.438   122292261
## 3 Miami    Dolphins 2013  AFC        East       8      8       0.5     112354370
## 4 Miami    Dolphins 2014  AFC        East       8      8       0.5     142455998
## 5 Miami    Dolphins 2015  AFC        East       6     10       0.375   144878936
## 6 Miami    Dolphins 2016  AFC        East      10      6       0.625   151387347
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(nfc_south)
## # A tibble: 6 × 15
##   LOCATION TEAM    YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>    <chr>   <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Atlanta  Falcons 2011  NFC        South     10      6       0.625   126014378
## 2 Atlanta  Falcons 2012  NFC        South     13      3       0.812   121447956
## 3 Atlanta  Falcons 2013  NFC        South      4     12       0.25    110531645
## 4 Atlanta  Falcons 2014  NFC        South      6     10       0.375   133009495
## 5 Atlanta  Falcons 2015  NFC        South      8      8       0.5     136236065
## 6 Atlanta  Falcons 2016  NFC        South     11      5       0.688   153820903
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_south)
## # A tibble: 6 × 15
##   LOCATION    TEAM  YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>       <chr> <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Jacksonvil… Jagu… 2011  AFC        South      5     11       0.312    92775655
## 2 Jacksonvil… Jagu… 2012  AFC        South      2     14       0.125   129514023
## 3 Jacksonvil… Jagu… 2013  AFC        South      4     12       0.25    120190441
## 4 Jacksonvil… Jagu… 2014  AFC        South      3     13       0.188   129184374
## 5 Jacksonvil… Jagu… 2015  AFC        South      5     11       0.312   136130404
## 6 Jacksonvil… Jagu… 2016  AFC        South      3     13       0.188   150459217
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(nfc_north)
## # A tibble: 6 × 15
##   LOCATION TEAM  YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>    <chr> <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Chicago  Bears 2011  NFC        North      8      8       0.5     107756643
## 2 Chicago  Bears 2012  NFC        North     10      6       0.625   128921164
## 3 Chicago  Bears 2013  NFC        North      8      8       0.5     127169835
## 4 Chicago  Bears 2014  NFC        North      5     11       0.312   131432644
## 5 Chicago  Bears 2015  NFC        North      6     10       0.375   141512728
## 6 Chicago  Bears 2016  NFC        North      3     13       0.188   148323553
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_north)
## # A tibble: 6 × 15
##   LOCATION   TEAM   YEAR  CONFERENCE REGION  WINS LOSSES W.L_PERCENT TOTAL_ALLOT
##   <chr>      <chr>  <chr> <chr>      <chr>  <dbl>  <dbl>       <dbl>       <dbl>
## 1 Cleaveland Browns 2011  AFC        North      4     12      0.25     103839520
## 2 Cleaveland Browns 2012  AFC        North      5     11      0.312    127350645
## 3 Cleaveland Browns 2013  AFC        North      4     12      0.25     110066758
## 4 Cleaveland Browns 2014  AFC        North      7      9      0.438    135401214
## 5 Cleaveland Browns 2015  AFC        North      3     13      0.188    139647560
## 6 Cleaveland Browns 2016  AFC        North      1     15      0.0625   130222141
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## #   LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
# Tag each division subset with a combined "<conference> <region>" label.
afc_east  <- mutate(afc_east,  loc = "AFC East")
nfc_east  <- mutate(nfc_east,  loc = "NFC East")
afc_west  <- mutate(afc_west,  loc = "AFC West")
nfc_west  <- mutate(nfc_west,  loc = "NFC West")
afc_north <- mutate(afc_north, loc = "AFC North")
nfc_north <- mutate(nfc_north, loc = "NFC North")
afc_south <- mutate(afc_south, loc = "AFC South")
nfc_south <- mutate(nfc_south, loc = "NFC South")

# Rebuild the full data set from the labelled subsets so that main_df also
# carries the new loc column (rows end up grouped by division, in this order).
main_df <- rbind(
  afc_east, nfc_east,
  afc_west, nfc_west,
  afc_north, nfc_north,
  afc_south, nfc_south
)
#View(main_df)

Preliminary Thoughts

Studying the relationship between win-loss percentages and average salaries over a span of 13 years of NFL data can be an intriguing and complex analysis. Here are some preliminary thoughts and considerations before we dive into this study:

Thoughts-Predictions

Preliminary thoughts can help guide data analysis in terms of formulating certain hypotheses that could answer the main questions that agencies would want to know.

  1. Higher Salaries Lead to Better Performance. We may study the prediction that teams with higher average salaries will have higher win-loss percentages. This explanation could be that higher salaries attract and keep better players, resulting in improved performance on the field.
  2. Salary Inequality and Competitive Balance. It may be hypothesized that leagues with greater salary disparities between teams exhibit more pronounced differences in win-loss percentages.
  3. Temporal Changes. Our predictions might be impacted by salary structures and league regulations that have evolved over the 13-year span. This would influence the relationship between salaries and win-loss percentages.
  4. Market Size and Financial Stability. Larger market teams and those with higher revenue may spend more on salaries and potentially have higher win-loss percentages.

Considerations

  1. Defining Variables Clearly. Must have a solid definition of win-loss percentages and average salaries in the context of NFL data.
  2. Data Quality and Source. Must verify the quality and reliability of the data. NFL data may vary between sources, and there is a possibility of false reporting with regard to salary caps and other sensitive information.
  3. Time Frame and Trends. Must consider a temporal aspect. For instance, there may be trends or patterns in win-loss percentages and average salaries over the span of 13 years. In addition, there may be significant events such as changes in NFL rules, economic factors, or player contracts that influence trends.
  4. Statistical Methods. Consider simple correlations, regression analysis, or time series analysis depending on the complexity of the data.
  5. Other Factors. There may be other invisible variables such as changes in coaching staff, player drafts, strategies, injuries, and league rules that influence the data.

Steps to Analyze

  1. Data Cleaning: Handling missing values, outliers, and ensuring data is correctly formatted.
  2. Exploratory Data Analysis: Using visualizations (histograms, bar charts, scatter plots) to explore the data.
  3. Hypothesis Testing: Formulating and testing hypotheses based on research questions, while also making certain that the test assumptions are satisfied.
  4. Model Building: Fitting appropriate models to the data and checking that the assumptions are met.
  5. Interpretation and Reporting: Interpreting the results and reporting the findings with appropriate statistical evidence.

Descriptive Statistics

Summary Statistics: Calculating means, medians, modes, standard deviations, and ranges for discrete and continuous variables. Frequency Distributions: Counting the frequency and percentages of categorical variables.

Data Cleaning

The data set was manually created by members of the research group to avoid missing values and outliers. It was made sure that the complete data set of all NFL data was formatted correctly and could easily be analyzed in a programming language. The main data frame is called ‘main_df’. The main data frame may be broken down into subsets if necessary for examining a specific part of the data.

Exploratory Data Analysis

Data Visualization - Categorical Data: Bar Graphs - Quantitative Data: Histograms and Scatter Plots

  1. Visualizing Quantitative Data:

WINS

# Histogram of wins per team-season, with a tick at every whole number of wins.
ggplot(main_df, aes(x = WINS)) +
  geom_histogram(binwidth = 0.5, fill = "cornflowerblue") +
  scale_x_continuous(breaks = 0:15) +
  scale_y_continuous(breaks = seq(0, 60, by = 5)) +
  labs(y = "Frequency", title = "Histogram of NFL Wins") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The distribution of NFL wins is roughly symmetrical, with most teams having seven wins each year.

LOSSES

# Histogram of losses per team-season, with a tick at every whole number of
# losses.
ggplot(main_df, aes(x = LOSSES)) +
  geom_histogram(binwidth = 0.5, fill = "red3") +
  scale_x_continuous(breaks = 0:16) +
  scale_y_continuous(breaks = seq(0, 60, by = 5)) +
  labs(y = "Frequency", title = "Histogram of NFL Losses") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The distribution of NFL losses is roughly symmetrical, with most teams having nine losses.

WIN.LOSS_PERCENT

# Histogram of win-loss percentage in bins of width 0.05.
ggplot(main_df, aes(x = W.L_PERCENT)) +
  geom_histogram(binwidth = 0.05, fill = "mediumpurple4") +
  scale_y_continuous(breaks = seq(0, 60, by = 5)) +
  labs(
    x = "Win_Loss Percentage",
    y = "Frequency",
    title = "Histogram of Win-Loss Percentages"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The distribution of NFL win-loss percentages is roughly symmetrical, with most teams having an approximate .500 percentage.

TOTAL ALLOCATION

# Histogram of total salary allocation.
# TOTAL_ALLOT is stored in dollars, while the axis title promises millions, so
# the tick labels are divided by 1e6 to make the title and the ticks agree.
ggplot(main_df, aes(x = TOTAL_ALLOT)) +
  geom_histogram(fill = "khaki2") +
  labs(
    x = "Total Allocation (in millions)",
    y = "Frequency",
    title = "Histogram of Total Salary Allocation"
  ) +
  scale_x_continuous(labels = function(x) format(x / 1e6, scientific = FALSE))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Total salary allocation (continuous variable) is difficult to graph as this number is different across all teams and years. Total allocation varies depending on numerous factors, giving us a histogram that is neither normal nor symmetrical.

Violin Plot - Total Allocation

# Conference-level subsets; AFC and NFC are reused by the later plots.
AFC <- filter(main_df, CONFERENCE == "AFC")

# Distribution of total allocation within each AFC division.
ggplot(AFC, aes(x = loc, y = TOTAL_ALLOT, fill = loc)) +
  geom_violin() +
  scale_y_continuous(
    labels = function(dollars) format(dollars, scientific = FALSE)
  ) +
  labs(
    x = "Affliation",
    y = "Total Allocation",
    title = "Violin Plot of Total Allocation by Affliation",
    fill = "Affliation"
  )

NFC <- filter(main_df, CONFERENCE == "NFC")

# Distribution of total allocation within each NFC division.
ggplot(NFC, aes(x = loc, y = TOTAL_ALLOT, fill = loc)) +
  geom_violin() +
  scale_y_continuous(
    labels = function(dollars) format(dollars, scientific = FALSE)
  ) +
  labs(
    x = "Affliation",
    y = "Total Allocation",
    title = "Violin Plot of Total Allocation by Affliation",
    fill = "Affliation"
  )

HIGHEST SALARY

# Histogram of the highest salary.
# HIGHEST_SALARY is stored in dollars, while the axis title promises millions,
# so the tick labels are divided by 1e6 to make the title and the ticks agree.
ggplot(main_df, aes(x = HIGHEST_SALARY)) +
  geom_histogram(fill = "sandybrown") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Highest Salary (in millions)",
    y = "Frequency",
    title = "Histogram of Highest Salary"
  ) +
  scale_x_continuous(labels = function(x) format(x / 1e6, scientific = FALSE))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of the highest salary (continuous variable) is slightly right-skewed. Looking at the histogram, most highest salaries are located mid-way between \(10,000,000\) and \(20,000,000\) dollars.

Violin Plot - Highest Salary

# Distribution of the highest salary within each AFC division.
# The axis title promises millions, so the dollar values are shown divided by
# 1e6 on the tick labels.
ggplot(AFC, aes(x = loc, y = HIGHEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affliation",
    y = "Highest Salary (in millions)",
    title = "Violin Plot of Highest Salary by Affliation",
    fill = "Affliation"
  ) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

# Same plot for the NFC divisions.
ggplot(NFC, aes(x = loc, y = HIGHEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affliation",
    y = "Highest Salary (in millions)",
    title = "Violin Plot of Highest Salary by Affliation",
    fill = "Affliation"
  ) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

LOWEST SALARY

# Histogram of the lowest salary.
# LOWEST_SALARY is stored in dollars, while the axis title promises millions,
# so the tick labels are divided by 1e6 to make the title and the ticks agree.
ggplot(main_df, aes(x = LOWEST_SALARY)) +
  geom_histogram(fill = "aquamarine3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Lowest Salary (in millions)",
    y = "Frequency",
    title = "Histogram of Lowest Salaries"
  ) +
  scale_x_continuous(labels = function(x) format(x / 1e6, scientific = FALSE))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Violin Plot - Lowest Salary

# Distribution of the lowest salary within each AFC division.
# The axis title promises millions, so the dollar values are shown divided by
# 1e6 on the tick labels.
ggplot(AFC, aes(x = loc, y = LOWEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affliation",
    y = "Lowest Salary (in millions)",
    title = "Violin Plot of Lowest Salary by Affliation",
    fill = "Affliation"
  ) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

# Same plot for the NFC divisions.
ggplot(NFC, aes(x = loc, y = LOWEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affliation",
    y = "Lowest Salary (in millions)",
    title = "Violin Plot of Lowest Salary by Affliation",
    fill = "Affliation"
  ) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

HIGH LOW DIFFERENCE (The difference between the highest and lowest salary)

# Histogram of the gap between the highest and the lowest salary, with plain
# (non-scientific) numbers on the x axis.
ggplot(main_df, aes(x = HI_LOW_DIFF)) +
  geom_histogram(fill = "honeydew4") +
  scale_x_continuous(
    labels = function(dollars) format(dollars, scientific = FALSE)
  ) +
  labs(
    x = "Difference Between Highest and Lowest Salary",
    y = "Frequency",
    title = "Histogram"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

AVERAGE SALARY

# Histogram of the average salary.
# AVE_SALARY is stored in dollars, while the axis title promises millions, so
# the tick labels are divided by 1e6 to make the title and the ticks agree.
ggplot(main_df, aes(x = AVE_SALARY)) +
  geom_histogram(fill = "cyan2") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Average Salary (in millions)",
    y = "Frequency",
    title = "Histogram of Average Salary"
  ) +
  scale_x_continuous(labels = function(x) format(x / 1e6, scientific = FALSE))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

For most teams, the average salary ranges from \(5,000,000\) to \(10,000,000\) dollars.

Violin Plot - Average Salary By Region

# Distribution of the average salary within each AFC division.
# The axis title promises millions, so the dollar values are shown divided by
# 1e6 on the tick labels.
ggplot(AFC, aes(x = loc, y = AVE_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affliation",
    y = "Average Salary (in millions)",
    title = "Violin Plot of Average Salary by Affliation",
    fill = "Affliation"
  ) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

# Same plot for the NFC divisions.
ggplot(NFC, aes(x = loc, y = AVE_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affliation",
    y = "Average Salary (in millions)",
    title = "Violin Plot of Average Salary by Affliation",
    fill = "Affliation"
  ) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

Bar Graphs - NFC

# Average salary per team for each NFC division.
#
# Each team has one row per season, so stat = "identity" would stack all of a
# team's yearly values and the bar height would be their SUM. Using
# stat = "summary" with fun = "mean" draws one bar per team at the mean of its
# yearly average salaries, which is what the title describes.
# The tick labels are divided by 1e6 to match the "(in millions)" axis title.

ggplot(nfc_east, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The NFC East"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

ggplot(nfc_west, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The NFC West"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

ggplot(nfc_south, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The NFC South"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

ggplot(nfc_north, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The NFC North"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

Bar Graphs - AFC

# Average salary per team for each AFC division.
#
# Each team has one row per season, so stat = "identity" would stack all of a
# team's yearly values and the bar height would be their SUM. Using
# stat = "summary" with fun = "mean" draws one bar per team at the mean of its
# yearly average salaries, which is what the title describes.
# The tick labels are divided by 1e6 to match the "(in millions)" axis title.

ggplot(afc_east, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The AFC East"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

ggplot(afc_west, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The AFC West"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

ggplot(afc_north, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The AFC North"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

ggplot(afc_south, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    x = "Team",
    y = "Average Salary (in millions)",
    title = "Average Salary by Teams In The AFC South"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = function(y) format(y / 1e6, scientific = FALSE))

  2. Looking at Variable Relationships: Scatter Plots Between Two Numerical Variables

Initial Look at the Relationship Between Win-Loss Percentage and Average Salary

# Keep YEAR as text so it is treated as a categorical variable when used for
# colour or facets; this is a no-op when YEAR is already character.
main_df$YEAR <- as.character(main_df$YEAR)

# Win-loss percentage against average salary for the AFC, one panel and one
# fitted least-squares line per division.
# The stray empty trailing argument in scale_x_continuous() has been removed,
# and the tick labels are divided by 1e6 to match the "(in millions)" title.
ggplot(data = AFC, aes(x = AVE_SALARY, y = W.L_PERCENT, color = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid") +
  labs(
    x = "Average Salary (in millions)",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary in the AFC Conference"
  ) +
  scale_x_continuous(labels = function(x) format(x / 1e6, scientific = FALSE)) +
  facet_wrap(~ loc) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

# Win-loss percentage against average salary for the NFC, one panel and one
# fitted least-squares line per division.
# AVE_SALARY is stored in dollars; the tick labels are divided by 1e6 to match
# the "(in millions)" axis title.
ggplot(data = NFC, aes(x = AVE_SALARY, y = W.L_PERCENT, color = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid") +
  labs(
    x = "Average Salary (in millions)",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary in the NFC Conference"
  ) +
  scale_x_continuous(labels = function(x) format(x / 1e6, scientific = FALSE)) +
  facet_wrap(~ loc) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

# Win-loss percentage against average salary for all teams and seasons, with a
# single fitted least-squares line.
# breaks and labels are set in ONE scale_x_continuous() call: adding a second
# x scale replaces the first, which silently discarded the non-scientific
# label formatter (and produced the "Scale for x is already present" message).
ggplot(data = main_df, aes(x = AVE_SALARY, y = W.L_PERCENT)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Region"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    labels = function(x) format(x, scientific = FALSE)
  )
## `geom_smooth()` using formula = 'y ~ x'

Initial Look at the Relationship Between Win-Loss Percentage and Average Salary (grouped by year)

# Win-loss percentage against average salary, one panel per season, each with
# its own fitted least-squares line drawn in black.
# breaks and labels are set in ONE scale_x_continuous() call: adding a second
# x scale replaces the first, which silently discarded the non-scientific
# label formatter (and produced the "Scale for x is already present" message).
ggplot(data = main_df, aes(x = AVE_SALARY, y = W.L_PERCENT, color = YEAR)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    labels = function(x) format(x, scientific = FALSE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

# Seasons 2011-2016.
# YEAR is stored as text, so it is converted explicitly before the numeric
# comparison instead of relying on R coercing 2017 to "2017" and comparing
# the two as strings.
nfl_2011_2016 <- main_df %>%
  filter(as.integer(YEAR) < 2017)

# Win-loss percentage against average salary, one panel per season, points
# coloured by division, with a black least-squares line per panel.
# breaks and labels are set in ONE scale_x_continuous() call: adding a second
# x scale replaces the first, which silently discarded the label formatter.
# The tick labels are divided by 1e6 to match the "(in millions)" axis title.
ggplot(data = nfl_2011_2016,
       aes(x = AVE_SALARY, y = W.L_PERCENT, color = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary (in millions)",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = c(3000000, seq(5000000, 20000000, by = 2500000)),
    labels = function(x) format(x / 1e6, scientific = FALSE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

# Seasons 2017-2020.
# YEAR is stored as text, so it is converted explicitly before the numeric
# comparison instead of relying on implicit number-to-string coercion.
nfl_2017_2020 <- main_df %>%
  filter(as.integer(YEAR) > 2016, as.integer(YEAR) < 2021)

# Win-loss percentage against average salary, one panel per season, with a
# black least-squares line per panel.
# breaks and labels are set in ONE scale_x_continuous() call: adding a second
# x scale replaces the first, which silently discarded the non-scientific
# label formatter (and produced the "Scale for x is already present" message).
ggplot(data = nfl_2017_2020,
       aes(x = AVE_SALARY, y = W.L_PERCENT, color = YEAR)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    labels = function(x) format(x, scientific = FALSE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

# Seasons from 2021 onward.
# YEAR is stored as text, so it is converted explicitly before the numeric
# comparison instead of relying on implicit number-to-string coercion.
nfl_2021_2023 <- main_df %>%
  filter(as.integer(YEAR) > 2020)

# Win-loss percentage against average salary, one panel per season, with a
# black least-squares line per panel.
# breaks and labels are set in ONE scale_x_continuous() call: adding a second
# x scale replaces the first, which silently discarded the non-scientific
# label formatter (and produced the "Scale for x is already present" message).
ggplot(data = nfl_2021_2023,
       aes(x = AVE_SALARY, y = W.L_PERCENT, color = YEAR)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    labels = function(x) format(x, scientific = FALSE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

Wins vs Total Allocation AFC

# AFC: wins against total allocation, points coloured by division, with one
# black least-squares line fitted over all AFC points.
ggplot(data = AFC, aes(x = TOTAL_ALLOT, y = WINS, color = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, color = "black", linetype = "solid") +
  scale_x_continuous(
    labels = function(dollars) format(dollars, scientific = FALSE)
  ) +
  labs(
    x = "Total Allocation",
    y = "Wins",
    title = "Wins vs Total Allocation in the AFC"
  )
## `geom_smooth()` using formula = 'y ~ x'

Wins vs Total Allocation NFC

# NFC: wins against total allocation, points coloured by division, with one
# black least-squares line fitted over all NFC points.
ggplot(data = NFC, aes(x = TOTAL_ALLOT, y = WINS, color = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, color = "black", linetype = "solid") +
  scale_x_continuous(
    labels = function(dollars) format(dollars, scientific = FALSE)
  ) +
  labs(
    x = "Total Allocation",
    y = "Wins",
    title = "Wins vs Total Allocation in the NFC"
  )
## `geom_smooth()` using formula = 'y ~ x'

Summary Statistics

Calculating means, medians, modes, standard deviations, and ranges for discrete and continuous variables.

Discrete variables: Wins, Losses, Win-Loss Percentage, Highest Salary Percent, Lowest Salary Percent

# Min / quartiles / median / mean / max for the count and proportion columns;
# each cat() call prints a heading for the summary that follows it.
cat("Wins:\n")
## Wins:
summary(main_df$WINS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.000   8.000   8.079  10.000  15.000
cat("Losses:\n")
## Losses:
summary(main_df$LOSSES)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   6.000   8.000   7.969  10.000  16.000
cat("High Percent:\n")
## High Percent:
summary(main_df$HIGH_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04286 0.08000 0.09846 0.10200 0.12130 0.18564
cat("Low Percent:\n")
## Low Percent:
summary(main_df$LOW_PERCENT)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000100 0.0001900 0.0002750 0.0005283 0.0005900 0.0036200
cat("Win-Loss Percentage:\n")
## Win-Loss Percentage:
summary(main_df$W.L_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3750  0.5000  0.5031  0.6471  0.9375
# NOTE(review): despite its name, this keeps the 2011 rows of EVERY team (no
# Carolina filter, and the year is 2011, not 2015). The object is not used
# again in the visible part of the script -- confirm the intended filter or
# remove it. YEAR is character here, so the comparison relies on 2011 being
# coerced to "2011".
carolina_2015<-main_df%>%
  filter(YEAR==2011)

Continuous variables: Total Allocation, Highest Salary, Lowest Salary, High and Low Salary Difference, Average Salary

# Min / quartiles / median / mean / max for the dollar-valued salary columns;
# each cat() call prints a heading for the summary that follows it.
cat("Total Allocation:\n")
## Total Allocation:
summary(main_df$TOTAL_ALLOT)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  87714299 132019471 161688340 163999166 194275455 232771942
cat("Highest Salary:\n")
## Highest Salary:
summary(main_df$HIGHEST_SALARY)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  5610000 12300000 15850342 16627399 19991750 37133825
cat("Lowest Salary:\n")
## Lowest Salary:
summary(main_df$LOWEST_SALARY)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     667   29118   49080   80339   96225  626664
cat("High and Low Salary Difference:\n")
## High and Low Salary Difference:
summary(main_df$HI_LOW_DIFF)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  5504118 12248240 15752550 16547060 19925657 36977159

Average Salary Variable

# Location and spread of the average salary column.
cat("Summary Statistics for Average Salary\n")
## Summary Statistics for Average Salary
summary(main_df$AVE_SALARY)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  2857941  6167776  7958974  8353869 10013264 18645246
cat("Standard Deviation:\n")
## Standard Deviation:
# sd() is in the same units as AVE_SALARY (dollars).
sd(main_df$AVE_SALARY)
## [1] 2889497
cat("Variance:\n")
## Variance:
# var() is the square of sd(), i.e. in squared dollars, which is why it is so
# much larger than the salaries themselves.
var(main_df$AVE_SALARY)
## [1] 8.349196e+12

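The variance for average salary is a large number, but it is expressed in squared dollars, so its size alone is hard to interpret. The standard deviation of roughly \(2,900,000\) dollars, compared with a mean of roughly \(8,400,000\) dollars, indicates a large dispersion in the data.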

Specifics

# Mean average salary and mean wins: first by conference (AFC vs NFC), then
# for two individual divisions (AFC East and AFC West).
mean(AFC$AVE_SALARY)
## [1] 7977605
mean(NFC$AVE_SALARY)
## [1] 8730133
mean(AFC$WINS)
## [1] 8.019231
mean(NFC$WINS)
## [1] 8.139423
mean(afc_east$AVE_SALARY)
## [1] 6992129
mean(afc_east$WINS)
## [1] 8.346154
mean(afc_west$AVE_SALARY)
## [1] 9101711
mean(afc_west$WINS)
## [1] 8.307692

Hypothesis Testing

MAIN QUESTION: We want to understand the relationship between average salaries and win-loss percentages.

Test Assumptions: 1. Random Sampling: Sample is representative of the population.

  2. Normality of Data: Assumes that the data follows a normal distribution, which is important for parametric tests such as t-tests and ANOVA. If the data is not normally distributed, non-parametric tests might be more appropriate.

  3. Independence of Observations: The observations in the sample should be independent of each other. The value of one observation should not influence the value of another.

  4. Homogeneity of Variance: For tests such as ANOVA that compare variances between groups, an assumption is often made that the variances within each group are roughly equal (homogeneity of variances).

  5. Measurement Scale: The data should be measured on an appropriate scale. For example, if using a t-test, the data should be at least on an interval scale. If using a chi-square test, the data should be categorical.

  6. Null Hypothesis Structure: The null hypothesis should be precise and testable.

  7. Sample Size: Larger sample sizes tend to provide more reliable results and increase the power of statistical tests.

# Keep only the two variables of the main research question.
research <- select(main_df, W.L_PERCENT, AVE_SALARY)

summary(research)
##   W.L_PERCENT       AVE_SALARY      
##  Min.   :0.0000   Min.   : 2857941  
##  1st Qu.:0.3750   1st Qu.: 6167776  
##  Median :0.5000   Median : 7958974  
##  Mean   :0.5031   Mean   : 8353869  
##  3rd Qu.:0.6471   3rd Qu.:10013264  
##  Max.   :0.9375   Max.   :18645246
# Plain scatter plot of winning percentage against average salary, with
# non-scientific numbers on the x axis.
ggplot(research, aes(x = AVE_SALARY, y = W.L_PERCENT)) +
  geom_point() +
  scale_x_continuous(
    labels = function(dollars) format(dollars, scientific = FALSE)
  ) +
  labs(
    title = "Relationship Between Average Salary and Winning Percentage",
    x = "Average Salary",
    y = "Winning Percentage"
  ) +
  theme_minimal()

Correlation Analysis

# Correlation of winning percentage with each salary measure, computed on
# the rows where both values are present
with(research, cor(AVE_SALARY, W.L_PERCENT, use = "complete.obs"))
## [1] 0.1617668
with(main_df, cor(HIGHEST_SALARY, W.L_PERCENT, use = "complete.obs"))
## [1] 0.1578862
with(main_df, cor(TOTAL_ALLOT, W.L_PERCENT, use = "complete.obs"))
## [1] 0.06034338

Linear Regression

# Simple linear regression of winning percentage on average salary
model <- lm(formula = W.L_PERCENT ~ AVE_SALARY, data = research)
# Print the coefficient table, residual quantiles and fit statistics
summary(model)
## 
## Call:
## lm(formula = W.L_PERCENT ~ AVE_SALARY, data = research)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47923 -0.13403  0.00519  0.14523  0.47567 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.132e-01  2.851e-02  14.492  < 2e-16 ***
## AVE_SALARY  1.076e-08  3.226e-09   3.335 0.000929 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1899 on 414 degrees of freedom
## Multiple R-squared:  0.02617,    Adjusted R-squared:  0.02382 
## F-statistic: 11.12 on 1 and 414 DF,  p-value: 0.0009286
# Scatter plot with the fitted least-squares line drawn on top
# (confidence band switched off)
ggplot(data = research, mapping = aes(x = AVE_SALARY, y = W.L_PERCENT)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  ggtitle("Relationship Between Average Salary and Winning Percentage with Regression Line") +
  xlab("Average Salary") +
  ylab("Winning Percentage") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Plotting residuals

# Residuals of the linear model in row order. Explicit labels replace the
# auto-generated "model$residuals" text, matching the labelled
# residuals-vs-fitted plot that follows.
plot(residuals(model),
     xlab = "Observation Index",
     ylab = "Residuals",
     main = "Residuals by Observation Order")

# Distribution of the residuals, used to eyeball the normality assumption
hist(residuals(model),
     xlab = "Residuals",
     main = "Histogram of Residuals")

Checking linearity

# Residuals-vs-fitted diagnostic for the linear model
plot(
  x = fitted(model),
  y = resid(model),
  main = "Residuals vs. Fitted Values",
  xlab = "Fitted Values",
  ylab = "Residuals"
)
# Horizontal reference line at zero
abline(h = 0, col = "red")

Linearity assumption is reasonable

Added Variable plots

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Added-variable plot for the linear model (from the car package)
car::avPlots(model)

# Base-graphics scatter plot of the raw data with the fitted line overlaid
with(
  main_df,
  plot(AVE_SALARY, W.L_PERCENT,
       main = "Scatterplot of Average Salary vs. Winning Percentage",
       xlab = "Average Salary",
       ylab = "Winning Percentage")
)
abline(model, col = "blue")

Fitting a Polynomial Model Instead

# Regression of winning percentage on a second-degree polynomial in
# average salary, to check for curvature
poly_model <- lm(formula = W.L_PERCENT ~ poly(AVE_SALARY, 2), data = main_df)

# Print the coefficient table and fit statistics
summary(poly_model)
## 
## Call:
## lm(formula = W.L_PERCENT ~ poly(AVE_SALARY, 2), data = main_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47850 -0.13659  0.00275  0.14416  0.48200 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.503132   0.009319  53.989  < 2e-16 ***
## poly(AVE_SALARY, 2)1  0.633418   0.190075   3.332 0.000938 ***
## poly(AVE_SALARY, 2)2 -0.099206   0.190075  -0.522 0.601999    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1901 on 413 degrees of freedom
## Multiple R-squared:  0.02681,    Adjusted R-squared:  0.0221 
## F-statistic: 5.689 on 2 and 413 DF,  p-value: 0.003654
# Residuals-vs-fitted plot for the polynomial model, with a dashed
# reference line at zero
ggplot(
  data = data.frame(
    fitted = fitted(poly_model),
    residuals = resid(poly_model)
  ),
  mapping = aes(x = fitted, y = residuals)
) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
  ggtitle("Residuals vs. Fitted Values") +
  xlab("Fitted Values") +
  ylab("Residuals") +
  theme_minimal()

Time Series Analysis

# Winning percentage by season, one line per team.
# YEAR was converted to character earlier in the file, so ggplot2 treats it
# as discrete and groups by it; without an explicit group aesthetic
# geom_line() only joins points that share a year (vertical strokes).
# Grouping by TEAM joins each team's seasons instead.
# NOTE(review): a team whose TEAM value changed over the years would be
# drawn as separate lines -- confirm against the data.
ggplot(main_df, aes(x = YEAR, y = W.L_PERCENT, group = TEAM)) +
  geom_line(alpha = 0.4) +
  labs(title = "Winning Percentage Over Time",
       x = "Year",
       y = "Winning Percentage") +
  theme_minimal()

# Plot Average Salary over Time, one line per team.
# YEAR is a character column, so an explicit group aesthetic is needed for
# geom_line() to connect seasons rather than the points within one year.
# The axis title says "in millions", so tick labels are divided by 1e6
# (presumably AVE_SALARY is in dollars -- confirm against the data source).
ggplot(main_df, aes(x = YEAR, y = AVE_SALARY, group = TEAM)) +
  geom_line(alpha = 0.4) +
  labs(title = "Average Salary Over Time",
       x = "Year",
       y = "Average Salary (in millions)") +
  theme_minimal() +
  scale_y_continuous(
    labels = function(x) format(x / 1e6, scientific = FALSE)
  )

Interpretation and Reporting

\[Y_i = \beta_0 + \beta_1X_i+ \epsilon_i\]

\[Y_i: \text{Win Percentage}\]

\[X_i: \text{Average Salary}\]